This vignette will help us explore the data available for
legumes - Downloading the data
- Sub-setting the data to agronomy and legumes
- Exploring geographic locations of studies
- Exploring crop types (products) in the data
- Exploring agronomic practices used in the studies
- Exploring the outcomes reported
# Set S3 path and initialize
s3 <- s3fs::S3FileSystem$new(anonymous = TRUE)
era_s3 <- "s3://digital-atlas/era"
bundle_dir <- file.path(era_s3, "data", "packaged")
# Get the latest bundle
all_files <- s3$dir_ls(bundle_dir)
latest_bundle <- tail(sort(grep("era_agronomy_bundle.*\\.tar\\.gz$", all_files, value = TRUE)), 1)
# Define download and extraction paths
dl_dir <- "downloaded_data"
dir.create(dl_dir, showWarnings = FALSE)
bundle_local <- file.path(dl_dir, basename(latest_bundle))
extract_dir <- file.path(dl_dir, tools::file_path_sans_ext(tools::file_path_sans_ext(basename(latest_bundle))))
# Download and extract
if (!file.exists(bundle_local)) {
s3$file_download(latest_bundle, bundle_local, overwrite = TRUE)
}
if (!dir.exists(extract_dir)) {
dir.create(extract_dir)
utils::untar(bundle_local, exdir = extract_dir)
}
# Locate files
json_agronomic <- list.files(extract_dir, pattern = "^agronomic_.*\\.json$", full.names = TRUE)
json_master <- list.files(extract_dir, pattern = "^era_master_codes.*\\.json$", full.names = TRUE)
parquet_file <- list.files(extract_dir, pattern = "^era_compiled.*\\.parquet$", full.names = TRUE)
# Load into variables
ERA_Compiled <- arrow::read_parquet(parquet_file)
# Download the era master vocab ######
era_vocab_url <- "https://github.com/peetmate/era_codes/raw/main/era_master_sheet.xlsx"
era_vocab_local <- file.path(dl_dir, basename(era_vocab_url))
if(!file.exists(era_vocab_local)|update_vocab){
download.file(era_vocab_url, era_vocab_local, mode = "wb") # Download and write in binary mode
}
# Import the vocab
sheet_names <- readxl::excel_sheets(era_vocab_local)
sheet_names <- sheet_names[!grepl("sheet|Sheet", sheet_names)]
# Read each sheet into a list named era_master_codes
era_master_codes <- sapply(
sheet_names,
FUN = function(x) {
data.table::data.table(readxl::read_excel(era_vocab_local, sheet = x))
},
USE.NAMES = TRUE
)
# Access and filter the era_fields_v2 sheet
era_fields_comp = era_master_codes["era_fields_v1"]